Supercias Companies Analysis#

First we are goint to ge the date from the supercias ranking website

packages = ['seaborn', 'matplotlib', 'numpy', 'altair', 'pandas', 'numpy', 'os', 'pymongo', 'gridfs']
for package in packages:
    try:
        __import__(package)
    except ImportError:
        !pip install {package}

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import ScalarFormatter
import altair as alt
import os 

# set the default renderer to vega
alt.data_transformers.enable("vegafusion")
alt.renderers.enable('default')
# Set display format for floating-point numbers
pd.options.display.float_format = '{:,.2f}'.format

#check if the following files are in the Downloads folder
domain = 'https://appscvsmovil.supercias.gob.ec/ranking/recursos/'
# the user should change the folder path to the one where the files are located
folder = '/home/' +  os.environ['USER'] + '/Downloads/'
ranking_path = folder + 'bi_ranking.csv' if 'bi_ranking.csv' in os.listdir(folder) else domain + 'bi_ranking.csv'
ids_path = folder + 'bi_compania.csv' if 'bi_compania.csv' in os.listdir(folder) else domain + 'bi_compania.csv'
segmentos_path = folder + 'bi_segmento.csv' if 'bi_segmento.csv' in os.listdir(folder) else domain + 'bi_segmento.csv'
ciiu_path = folder + 'bi_ciiu.csv' if 'bi_ciiu.csv' in os.listdir(folder) else domain + 'bi_ciiu.csv'

# read from a csv file into a pd dataframe
df_ranking = pd.read_csv(ranking_path, low_memory=False)
# the companias string 
df_ids = pd.read_csv(ids_path, low_memory=False)
# segementos
df_segmentos = pd.read_csv(segmentos_path, low_memory=False)
# Código de Clasificacón Industrial Internacional Unifrome
df_ciiu = pd.read_csv(ciiu_path, low_memory=False)

We also want to read from the the mondo BD and get all fo teh documets from each company#

from pymongo import MongoClient
from gridfs import GridFS

# Connect to MongoDB on local host with ip address
endpoint = 'mongodb://10.0.10.5:27017'
db = MongoClient(endpoint)['supercias_ranking']
#companies = db['companies_suggestions']

# check the that here is a mongodb collection
#print(companies.count_documents({}))

Let’s only get the year 2023 and make the passivos by subtracting the patrimonio from the activos#

let’s merge all of the df into one#

# get the anio with 2023
df_ranking = df_ranking[df_ranking['anio'] == 2023]

# rename the cuii column 
df_ciiu = df_ciiu.rename(columns={'descripcion': 'ciiu_desc', 'ciiu': 'ciiu_code'}) 
df_ciiu['ciiu_code'] = df_ciiu['ciiu_code'].str.strip()

# match all of the expedientes in the df_ids with the expedientes in the df_ranking 
df = pd.merge(df_ranking, df_ids, on='expediente', how='left')

# let's merge the ciiu and the segments with the 
df = pd.merge(df, df_ciiu, left_on='ciiu_n1', right_on='ciiu_code', how='left')
df.rename(columns={'ciiu_code': 'ciiu_n1_code', 'ciiu_desc': 'ciiu_n1_desc'}, inplace=True)
df = pd.merge(df, df_ciiu, left_on='ciiu_n6', right_on='ciiu_code', how='left')
df.rename(columns={'ciiu_code': 'ciiu_n6_code', 'ciiu_desc': 'ciiu_n6_desc'}, inplace=True)

# get the passivos by subtracting patrimonio from activos
df['pasivos'] = df['activos'] - df['patrimonio']

# check that there are error where activos is not equal to passivos + patrimonio
#df_error = df[df['activos'] != df['pasivos'] + df['patrimonio']]

# get the name and the passivos columns only
#df_error = df_error[['nombre', 'activos', 'pasivos', 'patrimonio']]

#df_error['margin'] = df_error['activos'] - df_error['pasivos'] - df_error['patrimonio']

# df print nu
#df_error.head()

#print(df.columns)

let seperate the companies into categoris by the number of activos#

How we want ot to know that is the ditribution of

# make companies categories between 0 and 2.5 billion in assets
# small companies: 0 - 500 thousand
# medium companies: 500 thousand - 50 million
# large companies: 50 million - up 

df_small = df[df['activos'] < 500000]

df_medium = df[(df['activos'] >= 500000) & (df['activos'] < 50000000)]

df_large = df[(df['activos'] >= 50000000)]

# get the number of companies in each category and make a pie chart
pie_df = pd.DataFrame({
    'size': ['Small - 0 - 500k', 'Medium - 500k - 50m', 'Large - 50 - up'],
    'count': [df_small.shape[0], df_medium.shape[0], df_large.shape[0]]
})

pie_df['percentage'] = (pie_df['count'] / pie_df['count'].sum()) * 100

# Define custom colors for each category using named colors
color_scale = alt.Scale(domain=['Small - 0 - 500k', 'Medium - 500k - 50m', 'Large - 50m- up'],
    range=['steelblue', 'orange', 'mediumseagreen' ])

pie_df.head()

pie_chart = alt.Chart(pie_df).mark_arc().encode(
    theta=alt.Theta(field="count", type="quantitative"),
    color=alt.Color(field="size", type="nominal", scale=color_scale),
    tooltip=['size', 'count', alt.Tooltip('percentage:Q', format='.1f', title='percentage')]
).interactive()

pie_chart.display()
def make_pie_chart(df, title):
    type_counts = df['tipo'].value_counts().reset_index()
    type_counts.columns = ['tipo', 'count']
    type_counts['percentage'] = (type_counts['count'] / type_counts['count'].sum()) * 100

    return alt.Chart(type_counts).mark_arc().encode(
        theta=alt.Theta(field="count", type="quantitative"),
        color=alt.Color(field="tipo", type="nominal"),
        tooltip=['tipo', 'count', alt.Tooltip('percentage:Q', format='.1f',title='percentage')]
        # show percentge in legentd
    ).interactive().properties(
        title=title
    )

all_pie = make_pie_chart(df, 'All Companies')
small_pie = make_pie_chart(df_small, 'Small Companies')
medium_pie = make_pie_chart(df_medium, 'Medium Companies')
large_pie = make_pie_chart(df_large, 'Large Companies')


# Combine the charts into two columns
column1 = alt.vconcat(all_pie, small_pie)
column2 = alt.vconcat(medium_pie, large_pie)

# Combine the charts
# make the legend independent
# make two columns
chart = alt.hconcat(
    column1, column2
).resolve_legend(
    size="independent"
).properties(
    title='Distribution of Companies types by Size',
)
chart.show()
def make_chart(df, title='Activos de Companias', max_bins=100, color='steelblue', width=450, height=300):
    # Create a selection
    click = alt.selection_point(encodings=['x'])
    
    # Create the right bar chart
    bars = alt.Chart(df).mark_bar().encode(
        x=alt.X('activos:Q', bin=alt.Bin(maxbins=max_bins), title='Activos'),
        y=alt.Y( 'count()', title='Numero de companias'),
        color=alt.condition(click, alt.ColorValue(color), alt.ColorValue(color)),
        tooltip=['count():Q']
    ).add_params(
        click
    ).properties(
        width=width,
        height=height,
        title=title,
    ).interactive()

    # Create the left chart (scatter plot)
    scatter = alt.Chart(df).mark_circle().encode(
        x='pasivos:Q',
        y='activos:Q',
        color='ciiu_n1_desc:N',
        tooltip=['nombre:N', 'activos:Q', 'pasivos:Q', 'patrimonio:Q', 'ciiu_n1_desc:N']
    ).transform_filter(
        click
    ).properties(
        width=width,
        height=height,
        title=''
    ).interactive()
    
    # Filter the left chart based on selection
    filtered_scatter = scatter.transform_filter(
        click
    )
    
    # Combine the charts
    chart = alt.vconcat(filtered_scatter, bars).resolve_legend(
    )
    chart.show()
make_chart(df_small, title='Activos de Companias pequenas', color='steelblue')
make_chart(df_medium, title='Activos de Companias Medianas', color='orange')
make_chart(df_large, title='Activos de Companias Grandes', color='mediumseagreen')

#

# save the df into the mongodb db called rankings in the supercias collection
rankings = db['rankings']

#rankings.insert_many(df.to_dict('records'))